{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import urllib\n",
    "import pandas as pd\n",
    "from neo4j import GraphDatabase\n",
    "\n",
    "driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'letmein'))\n",
    "\n",
    "def run_query(query, params={}):\n",
    "    with driver.session() as session:\n",
    "        result = session.run(query, params)\n",
    "        return pd.DataFrame([r.values() for r in result], columns=result.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ie_pipeline(text, relation_threshold=0.9, entities_threshold=0.8):\n",
    "    # Prepare the URL.\n",
    "    data = urllib.parse.urlencode([\n",
    "        (\"text\", text), (\"relation_threshold\", relation_threshold),\n",
    "        (\"entities_threshold\", entities_threshold)])\n",
    "    \n",
    "    url = \"http://localhost:5000?\" + data\n",
    "    req = urllib.request.Request(url, data=data.encode(\"utf8\"), method=\"GET\")\n",
    "    with urllib.request.urlopen(req, timeout=150) as f:\n",
    "        response = f.read()\n",
    "        response = json.loads(response.decode(\"utf8\"))\n",
    "    # Output the annotations.\n",
    "    return response"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Elon Musk example input"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "example_data = ie_pipeline(\"\"\"\n",
    "Elon Musk is a business magnate, industrial designer, and engineer.\n",
    "He is the founder, CEO, CTO, and chief designer of SpaceX.\n",
    "He is also early investor, CEO, and product architect of Tesla, Inc.\n",
    "He is also the founder of The Boring Company and the co-founder of Neuralink. \n",
    "A centibillionaire, Musk became the richest person in the world in January 2021, with an estimated net worth of $185 billion at the time, surpassing Jeff Bezos.\n",
    "Musk was born to a Canadian mother and South African father and raised in Pretoria, South Africa.\n",
    "He briefly attended the University of Pretoria before moving to Canada aged 17 to attend Queen's University.\n",
    "He transferred to the University of Pennsylvania two years later, where he received dual bachelor's degrees in economics and physics.\n",
    "He moved to California in 1995 to attend Stanford University, but decided instead to pursue a business career.\n",
    "He went on co-founding a web software company Zip2 with his brother Kimbal Musk.\n",
    "  \"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'entities': [{'label': 'Organization', 'title': 'Tesla, Inc.', 'wikiId': 'Q478214'}, {'label': 'Person', 'title': 'Jeff Bezos', 'wikiId': 'Q312556'}, {'label': 'Organization', 'title': 'University of Pennsylvania', 'wikiId': 'Q49117'}, {'label': 'Person', 'title': 'Elon Musk', 'wikiId': 'Q317521'}, {'label': 'Organization', 'title': 'The Boring Company', 'wikiId': 'Q28874479'}, {'label': 'Organization', 'title': 'Stanford University', 'wikiId': 'Q41506'}, {'label': 'Person', 'title': 'Kimbal Musk', 'wikiId': 'Q6409751'}, {'label': 'Organization', 'title': 'Pretoria', 'wikiId': 'Q3926'}, {'label': 'Organization', 'title': 'University of Pretoria', 'wikiId': 'Q604444'}, {'label': 'Organization', 'title': 'SpaceX', 'wikiId': 'Q193701'}, {'label': 'Organization', 'title': 'Neuralink', 'wikiId': 'Q29043471'}], 'relations': [{'source': 'The Boring Company', 'target': 'Neuralink', 'type': 'subsidiary'}, {'source': 'Tesla, Inc.', 'target': 'Elon Musk', 'type': 'owned by'}, {'source': 'Elon Musk', 'target': 'Neuralink', 'type': 'owned by'}, {'source': 'Neuralink', 'target': 'Elon Musk', 'type': 'owned by'}, {'source': 'Elon Musk', 'target': 'The Boring Company', 'type': 'owned by'}, {'source': 'Elon Musk', 'target': 'Tesla, Inc.', 'type': 'owned by'}, {'source': 'Elon Musk', 'target': 'University of Pretoria', 'type': 'work location'}, {'source': 'Elon Musk', 'target': 'Pretoria', 'type': 'work location'}, {'source': 'Elon Musk', 'target': 'University of Pennsylvania', 'type': 'work location'}, {'source': 'The Boring Company', 'target': 'Tesla, Inc.', 'type': 'owned by'}, {'source': 'Elon Musk', 'target': 'University of Pennsylvania', 'type': 'residence'}, {'source': 'University of Pennsylvania', 'target': 'Elon Musk', 'type': 'residence'}, {'source': 'Elon Musk', 'target': 'Kimbal Musk', 'type': 'sibling'}, {'source': 'The Boring Company', 'target': 'Elon Musk', 'type': 'owned by'}, {'source': 'Kimbal Musk', 'target': 'Elon Musk', 'type': 'sibling'}]}\n"
     ]
    }
   ],
   "source": [
    "print(example_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>'done'</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>done</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  'done'\n",
       "0   done"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import_direct_query = \"\"\"\n",
    "WITH $data as data\n",
    "UNWIND data.entities as entity\n",
    "MERGE (e:Entity{name:entity.title})\n",
    "ON CREATE SET e.wikiId = entity.wikiId\n",
    "WITH data, entity, e\n",
    "CALL apoc.create.addLabels(e,[entity.label]) YIELD node\n",
    "WITH data, count(*) as break_unwind\n",
    "UNWIND data.relations as relation\n",
    "MERGE (s:Entity{name:relation.source})\n",
    "MERGE (t:Entity{name:relation.target})\n",
    "WITH s,t,relation\n",
    "CALL apoc.create.relationship(s, relation.type, {}, t) \n",
    "YIELD rel\n",
    "RETURN distinct 'done'\n",
    "\"\"\"\n",
    "\n",
    "run_query(import_direct_query, {'data':example_data})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BBC news dataset example\n",
    "\n",
    "Make sure to download the dataset from https://www.kaggle.com/hgultekin/bbcnewsarchive.\n",
    "\n",
    "You will need to have the IE pipeline running on localhost:5000. You can run the following command to get it up and running:\n",
    "\n",
    "<code>\n",
    "    docker run -p 5000:5000 tomasonjo/trinityie\n",
    "</code> "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "run_query(\"MATCH (n) DETACH DELETE n\")\n",
    "run_query(\"CREATE CONSTRAINT IF NOT EXISTS ON (e:Entity) ASSERT e.name IS UNIQUE;\")\n",
    "run_query(\"CREATE INDEX rels IF NOT EXISTS FOR (n:Relation) ON (n.type);\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('bbc-news-data.csv', delimiter='\\t')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "import_refactored_query = \"\"\"\n",
    "UNWIND $params as value\n",
    "CREATE (a:Article{content:value.content})\n",
    "FOREACH (rel in value.ie.relations | \n",
    "  MERGE (s:Entity{name:rel.source})\n",
    "  MERGE (t:Entity{name:rel.target})\n",
    "  MERGE (s)-[:RELATION]->(r:Relation{type:rel.type})-[:RELATION]->(t)\n",
    "  MERGE (a)-[:MENTIONS_REL]->(r))\n",
    "WITH value, a\n",
    "UNWIND value.ie.entities as entity\n",
    "MERGE (e:Entity{name:entity.title})\n",
    "SET e.wikiId = entity.wikiId\n",
    "MERGE (a)-[:MENTIONS_ENT]->(e)\n",
    "WITH entity, e\n",
    "CALL apoc.create.addLabels(e,[entity.label]) YIELD node\n",
    "RETURN distinct 'done'\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run 500 articles through IE pipeline and store results to Neo4j"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "with driver.session() as session:\n",
    "    params = []\n",
    "    for i,article in list(data.iterrows())[:500]:\n",
    "        content = article['content']\n",
    "        ie_data = ie_pipeline(content)\n",
    "        params.append({'content':content, 'ie':ie_data})\n",
    "\n",
    "        if (len(params) % 100 == 0):\n",
    "            session.run(import_refactored_query, {'params':params})\n",
    "            params = []\n",
    "\n",
    "    session.run(update_query, {'params':params})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlp",
   "language": "python",
   "name": "nlp"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
